library(data.table)
library(ggplot2)
library(plyr)
library(magrittr)
library(readr)
library(stringr)
library(CellBarcode)
library(ggrepel)
library(ROCR)
library(ggsci)
theme0 <- theme_bw() + theme(
text = element_text(size = 15),
line = element_line(size = 1),
axis.line = element_line(size = 1),
axis.ticks.length = unit(3, units = "mm"),
axis.text.x = element_text(
margin = margin(t = 2, unit = "mm")
, angle = 60, vjust = 1, size = 15, hjust = 1),
axis.text.y = element_text(margin = margin(r = 3, l = 5, unit = "mm")),
legend.position = "right",
)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
theme0_lt = theme0 + theme(legend.position = "top")
theme1 = theme0 + theme(
axis.text.x = element_text(
margin = margin(t = 2, unit = "mm")
, angle = 0, vjust = 1, size = 12, hjust = 0.5)
)
knitr::opts_chunk$set(fig.width=20, fig.height=12)
Sample info
sample_info = fread("../run_simulation_no_umi/non_umi_simu_design_matrix.tsv")
sample_info$simu_id %<>% as.character
setkey(sample_info, "simu_id")
i_level_simu_name = sample_info[order(as.integer(simu_id)), simu_name][c(1:26)]
Clone size disbribution
true_barcode_l = read_rds("../run_preprocess_simulation/tmp/non_umi_simulation_ref.rds")
d_true_barcode = true_barcode_l %>% rbindlist(idcol = "simu_file")
x = d_true_barcode$simu_file %>% str_match("barcode_simulation_\\d+_simu_seq_(.*)") %>% extract(, c(2))
table(x, useNA = "always")
## x
## 1 10 11 12 13 14 15 16 17 18 19 2 20
## 8999 8998 8999 8998 8999 8999 8998 9000 8997 8998 8863 4499 8854
## 21 22 23 24 25 26 27 28 3 4 5 6 7
## 8999 6861 6869 6855 11971 20058 17997 35978 17989 35981 8998 8999 9000
## 8 9 <NA>
## 8999 8996 0
d_true_barcode = cbind(d_true_barcode, simu_id = x)
# d_true_barcode[simu_name == "hamming" & grepl("simulation_19_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode" & grepl("simulation_1_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode"]
d_true_barcode$simu_name = sample_info[d_true_barcode$simu_id, simu_name]
table(d_true_barcode$simu_name, useNA = "always")
##
## barcode_length_10
## 8999
## barcode_size_mean_3_variation_1_clone_n_1200
## 35978
## barcode_size_mean_3_variation_1_clone_n_600
## 17997
## base
## 8999
## clone_n_1200
## 35981
## clone_n_150
## 4499
## clone_n_600
## 17989
## cycle_20
## 8996
## cycle_40
## 8998
## efficiency_0.5
## 8999
## efficiency_0.9
## 8998
## error_1e-5
## 8999
## error_1e-7
## 8999
## hamming
## 8863
## hamming_size_variation_3
## 8854
## ngs_profile_MSv1
## 8998
## pcr_read_100
## 9000
## pcr_read_1000
## 8997
## pcr_read_25
## 8998
## size_mean_0.6
## 8998
## size_mean_3
## 8999
## size_variation_1
## 9000
## size_variation_3
## 8999
## vdj_barcode
## 6861
## vdj_barcode_size_mean_3_variation_1_clone_n_1200
## 20058
## vdj_barcode_size_mean_3_variation_1_clone_n_600
## 11971
## vdj_barcode_size_variation_1
## 6869
## vdj_barcode_size_variation_3
## 6855
## <NA>
## 0
d_true_barcode = rename(d_true_barcode, c("seq" = "barcode_seq"))
## Barcode frequency
ggplot(d_true_barcode[, .(freq = sum(freq)), by = .(simu_file, barcode_seq, simu_name)]) + aes(x = simu_name, y = freq) +
geom_boxplot() +
# geom_jitter(alpha = 0.2) +
theme0 + scale_y_log10() +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 53975 rows containing missing values (`stat_boxplot()`).

AUC
# NOTE: input
d_auc = fread("./tmp/table_auc_no_umi_reference.tsv")
d_count_true_barcode = fread("./tmp/table_count_true_barcode_reference.tsv")
count - true barcode
ggplot(d_count_true_barcode) + aes(x = simu_name, y = count, color = factor(is_true)) +
geom_boxplot(alpha = 0.2) +
theme0_lt + scale_y_log10() + scale_color_npg() +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 60505 rows containing missing values (`stat_boxplot()`).
## Warning: Removed 34 rows containing non-finite values (`stat_boxplot()`).

AUC versus simu conditions
ggplot(d_auc) + aes(x = simu_name, y = auc) + geom_boxplot() + theme0 +
scale_x_discrete(limits = i_level_simu_name) +
labs(y = "AUC") + lims(y = c(0, 1))
## Warning: Removed 60 rows containing missing values (`stat_boxplot()`).

ggplot(d_auc) + aes(x = simu_name, y = aucpr) + geom_boxplot() + theme0 +
scale_x_discrete(limits = i_level_simu_name) +
labs(y = "P-R AUC") + lims(y = c(0, 1))
## Warning: Removed 60 rows containing missing values (`stat_boxplot()`).

Resolusion index
d_count_true_barcode = fread("./tmp/table_count_true_barcode_reference.tsv")
# d_count_true_barcode[, count_cat := cut(clone_size, breaks = c(0, 1, 2, 3, 4, 5, 10, 20, Inf))]
# lapply(1:100, function(i) {
# d_count_true_barcode[, sum(count < i & is_true), ,by = .(clone_size, simu_file)]
# })
Apply the autothreshold strategy
# NOTE: input
d = fread("./tmp/table_no_umi_auto_threshold_predict_rate_reference.tsv")
# ggplot(d) + aes(x = tpr, y = fpr, label = simu_name) + geom_point() +
# geom_label_repel(alpha = 0.5)
d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value")
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value")
ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) +
geom_boxplot() + theme0 +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = 1 - value) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

Apply the threshold of 0.0001 of left reads
# NOTE: input
d = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_reference.tsv")
# ggplot(d) + aes(x = tpr, y = fpr, label = simu_name) + geom_point() +
# geom_label_repel(alpha = 0.5)
d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value")
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value")
ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) +
geom_boxplot() + theme0 +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = value) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

Merge threshold
d1 = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_reference.tsv")[, resolution := "res0.0001"]
d2 = fread("./tmp/table_no_umi_p00001_threshold_predict_rate_reference.tsv")[, resolution := "res0.00001"]
d = rbindlist(list(d1, d2))
d = melt(d, id.vars = c("simu_name", "resolution"), measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value")
d$resolution %<>% factor(levels = c("res0.0001", "res0.00001"))
ggplot(d) + aes(x = simu_name, y = value, color = resolution) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) + scale_color_npg() +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 240 rows containing missing values (`stat_boxplot()`).
